library(ggplot2)
library(HardyWeinberg)
## Loading required package: mice
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Loading required package: Rsolnp
library(ggfortify)
library(MASS)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:HardyWeinberg':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
library("gridExtra")
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
geno <- read.csv('./data_files/genotypes.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1) ## 344 50000
pheno <- read.csv('./data_files/phenotypes.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1) ## 344 5
covar <- read.csv('./data_files/covars.csv', stringsAsFactors = FALSE, header = TRUE, row.names = 1) ## 344 2
## Check the rownames of geno, pheno and covar files are consistent
sum(rownames(geno) == rownames(pheno)) ## 344
## [1] 344
sum(rownames(geno) == rownames(covar)) ## 344
## [1] 344
calculate the number of samples n and the numebr of SNPs N
n = dim(geno)[1]
N = dim(geno)[2]
make sure the genotype file looks good
geno[1:6,1:6]
## rs10399749 rs62641299 rs115523412 rs75932129 rs10900604 rs58686784
## HG00096 0 2 0 1 1 1
## HG00097 0 2 0 2 0 1
## HG00099 1 1 0 0 1 1
## HG00100 0 2 0 2 0 0
## HG00101 1 2 0 1 1 0
## HG00102 0 2 0 1 0 0
make sure the phenotype file looks good
head(pheno)
## ENSG00000164308.12 ENSG00000124587.9 ENSG00000180185.7
## HG00096 -1.166337 -0.68593575 -0.3062421
## HG00097 -1.045803 -0.17895757 -1.0333305
## HG00099 1.045803 -0.89457468 -2.5242603
## HG00100 0.223446 -0.09824326 1.1521112
## HG00101 -0.223446 0.86251206 1.3782830
## HG00102 -1.457684 -0.32916763 -0.1863454
## ENSG00000168827.9 ENSG00000136536.9
## HG00096 0.9275844 0.36000876
## HG00097 -0.6676644 -1.77825296
## HG00099 -2.7590424 -2.52426034
## HG00100 0.2533471 0.05451891
## HG00101 1.0842344 0.26085685
## HG00102 -1.1663369 -2.37832893
make sure the covar file looks good
head(covar)
## Population Sex
## HG00096 GBR MALE
## HG00097 GBR FEMALE
## HG00099 GBR FEMALE
## HG00100 GBR FEMALE
## HG00101 GBR MALE
## HG00102 GBR FEMALE
The gene name for the genes measured in the phenotype file
symbol <- c('ERAP2', 'PEX6', 'FAHD1', 'GFM1', 'MARCH7')
for (i in 1:dim(pheno)[2]){
p <- ggplot(pheno, aes(x=pheno[,i])) + geom_density()
p <- p + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i]))
p <- p + xlab(paste0(symbol[i], '_relative expression'))
plot(p)
ggsave(paste0('./gene',i,'_expr_density.pdf'), width=4, height=3)
}
geno_covar <- merge(geno, covar, by = 'row.names')
rownames(geno_covar) <- geno_covar$Row.names
geno_covar <- geno_covar[,-1]
geno_covar$pop_sex <- interaction(geno_covar$Population, geno_covar$Sex)
table(geno_covar$pop_sex)
##
## CEU.FEMALE FIN.FEMALE GBR.FEMALE TSI.FEMALE CEU.MALE FIN.MALE
## 37 55 46 43 41 34
## GBR.MALE TSI.MALE
## 39 49
pheno_covar <- merge(pheno, covar, by = 'row.names')
rownames(pheno_covar) <- pheno_covar$Row.names
pheno_covar <- pheno_covar[,-1]
pheno_covar$pop_sex <- interaction(pheno_covar$Population, pheno_covar$Sex)
table(pheno_covar$pop_sex)
##
## CEU.FEMALE FIN.FEMALE GBR.FEMALE TSI.FEMALE CEU.MALE FIN.MALE
## 37 55 46 43 41 34
## GBR.MALE TSI.MALE
## 39 49
df <- geno_covar[,c(1:50000)]
autoplot(prcomp(df), data = geno_covar, colour = 'Population', main = 'PCA plot_genotype -- population')
ggsave(paste0('./PCA plot_genotype -- population.pdf'), width=6, height=4)
autoplot(prcomp(df), data = geno_covar, colour = 'Sex', main = 'PCA plot_genotype -- gender')
ggsave(paste0('./PCA plot_genotype -- gender.pdf'), width=6, height=4)
autoplot(prcomp(df), data = geno_covar, colour = 'pop_sex', main = 'PCA plot_genotype -- population & gender')
ggsave(paste0('./PCA plot_genotype -- population & gender.pdf'), width=6, height=4)
#pca.result <- prcomp(geno)
#pca.result$sdev
#summary(pca.result)
#pcaDf <- data.frame(pc1=pca.result$x[,1], pc2=pca.result$x[,2])
#ggplot(pcaDf,aes(pc1,pc2)) + geom_point() + ggtitle("Genotypes with Population Structure")
for (i in 1:dim(pheno)[2]){
p1 <- ggplot(pheno_covar, aes(x=pheno_covar[,i], colour=Population)) + geom_density() + xlab(paste0(symbol[i], '_relative expression'))
p1 <- p1 + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i], ' by population'))
plot(p1)
ggsave(paste0('density plot for gene',i,'_' ,symbol[i], ' by population.pdf'), width=5, height=4)
p2 <- ggplot(pheno_covar, aes(x=pheno_covar[,i], colour=Sex)) + geom_density() + xlab(paste0(symbol[i], '_relative expression'))
p2 <- p2 + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i], ' by gender'))
plot(p2)
ggsave(paste0('density plot for gene',i,'_' ,symbol[i], ' by gender.pdf'), width=5, height=4)
p3 <- ggplot(pheno_covar, aes(x=pheno_covar[,i], colour=pop_sex)) + geom_density() + xlab(paste0(symbol[i], '_relative expression'))
p3 <- p3 + ggtitle(paste0('density plot for gene',i,'_' ,symbol[i], ' by population & gender'))
plot(p3)
ggsave(paste0('density plot for gene',i,'_' ,symbol[i], ' by population and gender.pdf'), width=5, height=4)
}
df <- pheno_covar[,c(1:5)]
autoplot(prcomp(df), data = pheno_covar, colour = 'Population', main = 'PCA plot_phenotype -- population')
ggsave(paste0('./PCA plot_phenotype -- population.pdf'), width=6, height=4)
autoplot(prcomp(df), data = pheno_covar, colour = 'Sex', main = 'PCA plot_pheno -- gender')
ggsave(paste0('./PCA plot_phenotype -- gender.pdf'), width=6, height=4)
autoplot(prcomp(df), data = pheno_covar, colour = 'pop_sex', main = 'PCA plot_pheno -- population & gender')
ggsave(paste0('./PCA plot_phenotype -- population & gender.pdf'), width=6, height=4)
maf_cal <- function(geno_col){
x = sum(geno_col)/(2*length(geno_col))
if (x >= 0.5){
return(1-x)
} else if (x < 0.5){
return(x)
}
}
maf <- apply(geno, 2, maf_cal)
sum(maf > 0.05)
## [1] 50000
p <- ggplot() + geom_histogram(aes(x = maf), color = 'black', fill = "darkblue", bins = 100)
p <- p + xlab('minor allele frequency')
p <- p + ylab('counts')
p <- p + ggtitle('Histogram for the minor allele frequency')
p
ggsave(paste0('./MAF.pdf'), width=6, height=4)
xa_converter <- function(x){
if (x == 0){
return(-1)
} else if (x == 1) {
return(0)
} else if (x == 2){
return(1)
}
}
xd_converter <- function(x){
if (x == 0 | x == 2){
return(-1)
} else if (x == 1){
return(1)
}
}
Xa <- matrix(0, nrow = dim(geno)[1], ncol = dim(geno)[2])
Xd <- matrix(0, nrow = dim(geno)[1], ncol = dim(geno)[2])
for (i in c(1:N)){
if (sum(geno[,i]/(2*n)) >= 0.5) {
Xa[,i] <- sapply(geno[,i], xa_converter)
} else if (sum(geno[,i]/(2*n)) < 0.5) {
Xa[,i] <- sapply(geno[,i], xa_converter)
Xa[,i] <- Xa[,i] * (-1)
}
}
Xd <- apply(geno, 1:2, xd_converter)
colnames(Xd) <- NULL
rownames(Xd) <- NULL
pval_F_statistic <- function(Xa, Xd, y) {
output <- as.numeric()
for (i in 1:N){
x <- matrix(c(matrix(1, nrow = n), Xa[,i], Xd[,i]), nrow = n)
y <- matrix(y, nrow = n)
MLE <- ginv(t(x)%*%x)%*%t(x)%*%y
y_pred <- x%*%MLE
SSM <- sum((y_pred-mean(y))^2)
DFM <- 3-1
SSE <- sum((y_pred-y)^2)
DFE <- n-3
MSM <- SSM/DFM
MSE <- SSE/DFE
F <- MSM/MSE
output[i] <- pf(F, 2, n-3, lower.tail = FALSE)
}
return(output)
}
pval_mx1 <- matrix(0,nrow = dim(geno)[2], ncol = dim(pheno)[2])
pval_df1 <- as.data.frame(pval_mx1)
rownames(pval_df1) <- colnames(geno)
colnames(pval_df1) <- colnames(pheno)
for (i in 1:dim(pheno)[2]){
pval_df1[,i] <- pval_F_statistic(Xa, Xd, pheno[,i])
}
p-value heatmap
autoplot(pval_mx1, main = 'p-value heatmap for linear regression model without covariates')
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
ggsave(paste0('./p-value heatmap_1.pdf'), width=6, height=4)
snp <- read.csv('./data_files/SNP_info.csv', header = TRUE, stringsAsFactors = FALSE)
sum(snp$id == colnames(geno))
## [1] 50000
sum(snp$id == rownames(pval_df1))
## [1] 50000
rownames(snp) <- snp$id
pval_df1 <- merge(pval_df1, snp, by = 'row.names', sort = FALSE)
rownames(pval_df1) <- pval_df1$Row.names
pval_df1 <- pval_df1[,-1]
pval_df1$chr <- as.factor(paste0('chr',pval_df1$chromosome))
pval_df1$locus = c(1:N)
for (i in 1:5){
p <- ggplot(data = pval_df1) + geom_point(aes(x = locus, y = -log10(pval_df1[,i]), color = chr), size = 1)
p <- p + xlab("locus")
p <- p + ylab("-log10(p-value)")
p <- p + ggtitle(paste0('Manhattan Plot for gene',i,'_' , symbol[i]))
p <- p + geom_hline(yintercept = -log10(0.05/N), color = 'red')
plot(p)
ggsave(paste0('./gene',i,'_', symbol[i],'_manhattan_plot.pdf'), width=12, height=3)
}
compute the potential causal SNPs for each gene
gene1_sig <- rownames(pval_df1)[which(pval_df1[,1] < 0.05/N)] ##ENSG00000164308.12_ERAP2
gene1_sig_idx <- which(pval_df1[,1] < 0.05/N)
length(gene1_sig)
## [1] 73
gene2_sig <- rownames(pval_df1)[which(pval_df1[,2] < 0.05/N)] ## ENSG00000124587.9_PEX6
gene2_sig_idx <- which(pval_df1[,2] < 0.05/N)
length(gene2_sig)
## [1] 29
gene3_sig <- rownames(pval_df1)[which(pval_df1[,3] < 0.05/N)] ## ENSG00000180185.7_FAHD1
gene3_sig_idx <- which(pval_df1[,3] < 0.05/N)
length(gene3_sig)
## [1] 90
gene4_sig <- rownames(pval_df1)[which(pval_df1[,4] < 0.05/N)] ## ENSG00000168827.9_GFM1
gene4_sig_idx <- which(pval_df1[,4] < 0.05/N)
length(gene4_sig)
## [1] 0
gene5_sig <- rownames(pval_df1)[which(pval_df1[,5] < 0.05/N)] ## ENSG00000136536.9_MARCH7
gene5_sig_idx <- which(pval_df1[,5] < 0.05/N)
length(gene5_sig)
## [1] 0
#pval_df1$expected_pval = sort(-log10(seq(0,1,length.out = N)))
expected_pval = sort(-log10(seq(0,1,length.out = N)))
for (i in 1:dim(pheno)[2]){
p <- ggplot() + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df1[,i]))))
p <- p + xlab("-log10(expected p-values)")
p <- p + ylab("-log10(observed p-values)")
p <- p + ggtitle(paste0("QQ plot for gene",i,'_' , symbol[i]))
p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
plot(p)
ggsave(paste0('./gene',i,'_',symbol[i],'_QQ_plot.pdf'), width=5, height=4)
}
for (i in 1:3){
p <- ggplot() + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df1[,i]))))
p <- p + xlab("-log10(expected p-values)")
p <- p + ylab("-log10(observed p-values)")
p <- p + ggtitle(paste0("QQ plot for gene",i,'_', symbol[i]))
p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
p <- p + ylim(c(0,10))
plot(p)
ggsave(paste0('./gene',i,'_', symbol[i], '_QQ_plot_2.pdf'), width=5, height=4)
}
## Warning: Removed 54 rows containing missing values (geom_point).
## Warning: Removed 54 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 85 rows containing missing values (geom_point).
## Warning: Removed 85 rows containing missing values (geom_point).
pval_calculator_lab10 <- function(pheno_input, xa_input, xd_input, z_input){
n_samples <- dim(Xa)[1]
# Set up random variables for null (Z_mx) and with genotypes (XZ_mx)
Z_mx <- cbind(1,z_input) # H0 (w/ covariates)
XZ_mx <- cbind(1,xa_input,xd_input,z_input) # w/ genotypes too
# Calculate MLE betas for both null model and model with genotypes and covariates
MLE_beta_theta0 <- ginv(t(Z_mx) %*% Z_mx) %*% t(Z_mx) %*% pheno_input
MLE_beta_theta1 <- ginv(t(XZ_mx) %*% XZ_mx) %*% t(XZ_mx) %*% pheno_input
# Get Y estimates using the betas calculated above to give each hypothesis its best chance
y_hat_theta0 <- Z_mx %*% MLE_beta_theta0
y_hat_theta1 <- XZ_mx %*% MLE_beta_theta1
# Get the variance between the true phenotype values and our estimates under each hypothesis
SSE_theta0 <- sum((pheno_input - y_hat_theta0)^2)
SSE_theta1 <- sum((pheno_input - y_hat_theta1)^2)
# Set degrees of freedom
df_M <- 2
df_E <- n_samples - 3
# Put together calculated terms to get Fstatistic
Fstatistic <- ((SSE_theta0-SSE_theta1)/df_M) / (SSE_theta1/df_E)
# Determine pval of the Fstatistic
pval <- pf(Fstatistic, df_M, df_E,lower.tail = FALSE)
return(pval)
}
covar$GBR <- ifelse(covar$Population == 'GBR',1,0)
covar$FIN <- ifelse(covar$Population == 'FIN',1,0)
covar$CEU <- ifelse(covar$Population == 'CEU',1,0)
covar$TSI <- ifelse(covar$Population == 'TSI',1,0)
covar$MALE <- ifelse(covar$Sex == 'MALE',1,0)
covar$FEMALE <- ifelse(covar$Sex == 'FEMALE',1,0)
Xz = as.matrix(covar[,c(3:8)])
pval_mx2 <- matrix(0,nrow = dim(geno)[2], ncol = dim(pheno)[2])
pval_df2 <- as.data.frame(pval_mx2)
rownames(pval_df2) <- colnames(geno)
colnames(pval_df2) <- colnames(pheno)
for (i in 1:dim(pheno)[2]){
for (j in 1:dim(geno)[2]){
pval <- pval_calculator_lab10(pheno[,i], Xa[,j], Xd[,j], Xz)
pval_df2[j,i] <- pval
}
}
autoplot(pval_mx2, main = 'p-value heatmap for linear regression model with covariates')
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
ggsave(paste0('./p-value heatmap_2.pdf'), width=6, height=4)
rownames(pval_df2) <- colnames(geno)
pval_df2 <- merge(pval_df2, snp, by = 'row.names', sort = FALSE)
rownames(pval_df2) <- pval_df2$Row.names
pval_df2 <- pval_df2[,-1]
pval_df2$chr <- as.factor(paste0('chr',pval_df2$chromosome))
pval_df2$locus = c(1:N)
for (i in 1:dim(pheno)[2]){
p <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,i]), color = chr))
p <- p + xlab("locus")
p <- p + ylab("-log10(p-value)")
p <- p + ggtitle(paste0('Manhattan Plot for gene',i,'_' , symbol[i], ' after modeling covariates'))
p <- p + geom_hline(yintercept = -log10(0.05/N), color = 'red')
plot(p)
ggsave(paste0('./gene',i,'_', symbol[i],'_manhattan_plot_covar.pdf'), width=12, height=3)
}
compute the potential causal SNPs for each gene
gene1_sig_covar <- rownames(pval_df2)[which(pval_df2[,1] < 0.05/N)] ##ENSG00000164308.12_ERAP2
gene1_sig_idx_covar <- which(pval_df2[,1] < 0.05/N)
length(gene1_sig_covar)
## [1] 74
gene2_sig_covar <- rownames(pval_df2)[which(pval_df2[,2] < 0.05/N)] ## ENSG00000124587.9_PEX6
gene2_sig_idx_covar <- which(pval_df2[,2] < 0.05/N)
length(gene2_sig_covar)
## [1] 27
gene3_sig_covar <- rownames(pval_df2)[which(pval_df2[,3] < 0.05/N)] ## ENSG00000180185.7_FAHD1
gene3_sig_idx_covar <- which(pval_df2[,3] < 0.05/N)
length(gene3_sig_covar)
## [1] 90
gene4_sig_covar <- rownames(pval_df2)[which(pval_df2[,4] < 0.05/N)] ## ENSG00000168827.9_GFM1
gene4_sig_idx_covar <- which(pval_df2[,4] < 0.05/N)
length(gene4_sig_covar)
## [1] 0
gene5_sig_covar <- rownames(pval_df2)[which(pval_df2[,5] < 0.05/N)] ## ENSG00000136536.9_MARCH7
gene5_sig_idx_covar <- which(pval_df2[,5] < 0.05/N)
length(gene5_sig_covar)
## [1] 0
expected_pval = sort(-log10(seq(0,1,length.out = N)))
for (i in 1:dim(pheno)[2]){
p <- ggplot() + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df2[,i]))))
p <- p + xlab("-log10(expected p-values)")
p <- p + ylab("-log10(observed p-values)")
p <- p + ggtitle(paste0("QQ plot for gene",i,'_' , symbol[i], ' after modeling covariates'))
p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
plot(p)
ggsave(paste0('./gene',i,'_', symbol[i],'_QQ_plot_covar.pdf'), width=5, height=4)
}
for (i in 1:3){
p <- ggplot(data = pval_df2) + geom_point(aes(x = expected_pval, y = sort(-log10(pval_df2[,i]))))
p <- p + xlab("-log10(expected p-values)")
p <- p + ylab("-log10(observed p-values)")
p <- p + ggtitle(paste0("QQ plot for gene",i,'_' , symbol[i], ' after modeling covariates'))
p <- p + geom_abline(slope = 1, intercept = 0, color = 'red')
p <- p + ylim(c(0,10))
plot(p)
ggsave(paste0('./gene',i, '_', symbol[i], '_QQ_plot_covar_2.pdf'), width=5, height=4)
}
## Warning: Removed 53 rows containing missing values (geom_point).
## Warning: Removed 53 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 85 rows containing missing values (geom_point).
## Warning: Removed 85 rows containing missing values (geom_point).
geno_HW <- as.numeric()
Xa_1 <- Xa
add_mx <- rbind(rep(-1,N), rep(0,N), rep(1,N))
Xa_1 <- rbind(Xa_1, add_mx)
Xa_1_summary <- apply(Xa_1, 2, table) ## 14 59 271
allele_mx <- matrix((unlist(Xa_1_summary)-1), nrow=dim(geno)[2], byrow=T)
colnames(allele_mx) <- c('AA', 'AB', 'BB')
Exact.pvalues <- HWExactStats(allele_mx,x.linked=FALSE)
sum(Exact.pvalues < 0.05)
## [1] 3436
idx <- which(Exact.pvalues > 0.05)
gene1_sig_HW <- intersect(gene1_sig_covar, colnames(geno)[idx])
length(gene1_sig_HW)
## [1] 67
gene1_sig_HW_idx <- intersect(gene1_sig_idx_covar, idx)
gene2_sig_HW <- intersect(gene2_sig_covar, colnames(geno)[idx])
length(gene2_sig_HW)
## [1] 26
gene2_sig_HW_idx <- intersect(gene2_sig_idx_covar, idx)
gene3_sig_HW <- intersect(gene3_sig_covar, colnames(geno)[idx])
length(gene3_sig_HW)
## [1] 82
gene3_sig_HW_idx <- intersect(gene3_sig_idx_covar, idx)
pval_df2$gene1_sig <- ifelse(rownames(pval_df2)%in%gene1_sig_HW, 'sig', 'nosig')
pval_df2$gene2_sig <- ifelse(rownames(pval_df2)%in%gene2_sig_HW, 'sig', 'nosig')
pval_df2$gene3_sig <- ifelse(rownames(pval_df2)%in%gene3_sig_HW, 'sig', 'nosig')
cols <- c('sig' = 'red', 'nosig' = "black")
pval_df2_gene1_sig <- pval_df2[which(pval_df2$gene1_sig == 'sig'),]
unique(pval_df2_gene1_sig$chromosome) ## chr5
## [1] 5
p1 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,1]), color = gene1_sig))
p1 <- p1 + scale_colour_manual(values = cols)
p1 <- p1 + xlab("locus") + ylab("-log10(p-value)")
p1 <- p1 + ggtitle('Manhattan Plot for gene1_ERAP2_local')
p1 <- p1 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p1 <- p1 + xlim(range(gene1_sig_HW_idx)[1]-100, range(gene1_sig_HW_idx)[2]+100)
plot(p1)
## Warning: Removed 49718 rows containing missing values (geom_point).
ggsave('./gene1_ERAP2_manhattan_plot_local.pdf', width=5, height=4)
## Warning: Removed 49718 rows containing missing values (geom_point).
gene1_sig_chr <- unique(pval_df2_gene1_sig$chromosome)
gene1_sig_loc <- pval_df2_gene1_sig$position
range(gene1_sig_loc)
## [1] 96772432 97110808
gene1_sig_loc_len <- range(gene1_sig_loc)[2] - range(gene1_sig_loc)[1]
gene1_sig_loc_len
## [1] 338376
print(paste0('The potential ', length(gene1_sig_loc), ' causal SNPs for gene1_ERAP2 are located on chromosome ', gene1_sig_chr, ', spans from ', range(gene1_sig_loc)[1], ' to ', range(gene1_sig_loc)[2], ', covers ', gene1_sig_loc_len, ' bp'))
## [1] "The potential 67 causal SNPs for gene1_ERAP2 are located on chromosome 5, spans from 96772432 to 97110808, covers 338376 bp"
cols <- c('sig' = 'red', 'nosig' = "black")
pval_df2_gene2_sig <- pval_df2[which(pval_df2$gene2_sig == 'sig'),]
unique(pval_df2_gene2_sig$chromosome) ## chr4 & chr6 ## chr4:"rs6854915"
## [1] 4 6
p2 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,2]), color = gene2_sig))
p2 <- p2 + scale_colour_manual(values = cols)
p2 <- p2 + xlab("locus") + ylab("-log10(p-value)")
p2 <- p2 + ggtitle('Manhattan Plot for gene2_PEX6_local')
p2 <- p2 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p2 <- p2 + xlim(range(gene2_sig_HW_idx)[1]-100, range(gene2_sig_HW_idx)[2]+100)
plot(p2)
## Warning: Removed 43877 rows containing missing values (geom_point).
ggsave('./gene2_PEX6_manhattan_plot_zoom_in.pdf', width=5, height=4)
## Warning: Removed 43877 rows containing missing values (geom_point).
cols <- c('sig' = 'red', 'nosig' = "black")
p2 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,2]), color = gene2_sig))
p2 <- p2 + scale_colour_manual(values = cols)
p2 <- p2 + xlab("locus") + ylab("-log10(p-value)")
p2 <- p2 + ggtitle('Manhattan Plot for gene2_PEX6_local_2')
p2 <- p2 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p2 <- p2 + xlim(range(gene2_sig_HW_idx[2:26])[1]-100, range(gene2_sig_HW_idx[2:26])[2]+100)
plot(p2)
## Warning: Removed 49772 rows containing missing values (geom_point).
ggsave('./gene2_PEX6_manhattan_plot_zoom_in_2.pdf', width=5, height=4)
## Warning: Removed 49772 rows containing missing values (geom_point).
gene2_sig_chr <- unique(pval_df2_gene2_sig[2:26,]$chromosome)
gene2_sig_loc <- pval_df2_gene2_sig[c(2:26),]$position
range(gene2_sig_loc)
## [1] 42889467 43108015
gene2_sig_loc_len <- range(gene2_sig_loc)[2] - range(gene2_sig_loc)[1]
gene2_sig_loc_len
## [1] 218548
print(paste0('The potential ', length(gene2_sig_loc), ' causal SNPs for gene2_PEX6 are located on chromosome ', gene2_sig_chr, ', spans from ', range(gene2_sig_loc)[1], ' to ', range(gene2_sig_loc)[2], ', covers ', gene2_sig_loc_len, ' bp'))
## [1] "The potential 25 causal SNPs for gene2_PEX6 are located on chromosome 6, spans from 42889467 to 43108015, covers 218548 bp"
cols <- c('sig' = 'red', 'nosig' = "black")
pval_df2_gene3_sig <- pval_df2[which(pval_df2$gene3_sig == 'sig'),]
unique(pval_df2_gene3_sig$chromosome) ## chr16
## [1] 16
p3 <- ggplot(data = pval_df2) + geom_point(aes(x = locus, y = -log10(pval_df2[,3]), color = gene3_sig))
p3 <- p3 + scale_colour_manual(values = cols)
p3 <- p3 + xlab("locus") + ylab("-log10(p-value)")
p3 <- p3 + ggtitle('Manhattan Plot for gene3_FAHD1_local')
p3 <- p3 + geom_hline(yintercept = -log10(0.05/N), color = 'red')
p3 <- p3 + xlim(range(gene3_sig_HW_idx)[1]-100, range(gene3_sig_HW_idx)[2]+100)
plot(p3)
## Warning: Removed 49700 rows containing missing values (geom_point).
ggsave('./gene3_FAHD1_manhattan_plot_local.pdf', width=5, height=4)
## Warning: Removed 49700 rows containing missing values (geom_point).
gene3_sig_chr <- unique(pval_df2_gene3_sig$chromosome)
gene3_sig_loc <- pval_df2_gene3_sig$position
range(gene3_sig_loc)
## [1] 1524250 1929366
gene3_sig_loc_len <- range(gene3_sig_loc)[2] - range(gene3_sig_loc)[1]
gene3_sig_loc_len
## [1] 405116
print(paste0('The potential ', length(gene3_sig_loc), ' causal SNPs for gene3_FAHD1 are located on chromosome ', gene3_sig_chr, ', spans from ', range(gene3_sig_loc)[1], ' to ', range(gene3_sig_loc)[2], ', covers ', gene3_sig_loc_len, ' bp'))
## [1] "The potential 82 causal SNPs for gene3_FAHD1 are located on chromosome 16, spans from 1524250 to 1929366, covers 405116 bp"
LD_plot <- function(index, gene){
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
index1 <- range(index)[1]-50
index2 <- range(index)[2]+50
Xa_100 <- Xa[,c(index1:index2)]
num <- dim(Xa_100)[2]
corr_mx = matrix(0, nrow = num, ncol = num)
for (i in 1:num){
for (j in 1:num){
cor <- cor.test(Xa_100[,i], Xa_100[,j])
corr_mx[i,j] <- (cor$estimate)^2}}
upper_tri <- get_upper_tri(corr_mx)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
p <- ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+ geom_tile()+ scale_fill_gradient2(low = "white", high = "red", limit = c(0,1), space = "Lab",name="r^2")
p <- p + theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1), axis.title.x = element_blank(), axis.title.y = element_blank(), panel.grid.major = element_blank(), panel.border = element_blank(), panel.background = element_blank())+ coord_fixed() + geom_segment(aes(x = 50, y = 50, xend = num-50, yend = 50), size = 0.5) + geom_segment(aes(x = num-50, y = 50, xend = num-50, yend = num-50), size = 0.5)
p <- p + ggtitle(paste0('Correlation matrix for gene', gene, '_', symbol[gene]))
ggsave(paste0('./correlation matrix for gene',gene,'_', symbol[gene],'.pdf'), width=5, height=4)
plot(p)
return(corr_mx)
}
corr_mx1 <- LD_plot(gene1_sig_HW_idx,1)
2 LD regions 1st covers 20 SNPs and 2nd covers 47 SNPs
rownames(pval_df2_gene1_sig)[1] ## "rs200641494"
## [1] "rs200641494"
pval_df2_gene1_sig$position[1] ## 96772432
## [1] 96772432
rownames(pval_df2_gene1_sig)[20] ## "rs70981851"
## [1] "rs70981851"
pval_df2_gene1_sig$position[20] ## 96848898
## [1] 96848898
rownames(pval_df2_gene1_sig)[21] ## "rs2549778"
## [1] "rs2549778"
pval_df2_gene1_sig$position[21] ## 96868550
## [1] 96868550
rownames(pval_df2_gene1_sig)[67] ## "rs72777610"
## [1] "rs72777610"
pval_df2_gene1_sig$position[67] ## 97110808
## [1] 97110808
gene1_sig_SNP <- rownames(pval_df2_gene1_sig)[21:67]
print('47 SNPs are identifed to be potential causal SNP for ERAP2')
## [1] "47 SNPs are identifed to be potential causal SNP for ERAP2"
gene1_sig_SNP
## [1] "rs2549778" "rs3840523" "rs1230363" "rs12189125" "rs12653964"
## [6] "rs1559354" "rs201477313" "rs201109800" "rs4869314" "rs10707238"
## [11] "rs2549784" "rs2161657" "rs6859160" "rs251339" "rs1423568"
## [16] "rs2549791" "rs2548525" "rs13163165" "rs2549801" "rs2910688"
## [21] "rs1363907" "rs1230382" "rs1216571" "rs1216568" "rs1046395"
## [26] "rs2548226" "rs6887500" "rs7726445" "rs1477364" "rs7731592"
## [31] "rs7723899" "rs7716222" "rs10058476" "rs140336797" "rs2161548"
## [36] "rs6879678" "rs201866654" "rs9918183" "rs1160962" "rs199514005"
## [41] "rs27306" "rs27307" "rs11414909" "rs27292" "rs27747"
## [46] "rs248215" "rs72777610"
corr_mx2 <- LD_plot(gene2_sig_HW_idx[2:26],2)
2 LD regions 1st covers 13 SNPs and 2nd covers 11 SNPs
pval_df2_gene2_sig$position[2] ## 42889467
## [1] 42889467
pval_df2_gene2_sig$position[14] ## 42946612
## [1] 42946612
pval_df2_gene2_sig$position[15] ## 42949278
## [1] 42949278
pval_df2_gene2_sig$position[25] ## 42977844
## [1] 42977844
pval_df2_gene2_sig$position[26] ## 43108015
## [1] 43108015
gene2_sig_SNP <- rownames(pval_df2_gene2_sig)[15:25]
print('11 SNPs are identifed to be potential causal SNP for PEX6')
## [1] "11 SNPs are identifed to be potential causal SNP for PEX6"
gene2_sig_SNP
## [1] "rs4714638" "rs9471976" "rs58497441" "rs13215983" "rs6920547"
## [6] "rs7760250" "rs1129187" "rs3805953" "rs10948061" "rs9471985"
## [11] "rs199921136"
calculate the correlation between the potential causal SNP for gene2_PEX6 in chr4 and the remaining 25 SNPs in chr6
geno_gene2 <- geno[,gene2_sig_HW_idx]
corr_2c = c()
for (i in 1:26){
cor <- cor.test(geno_gene2[,i], geno_gene2[,1])
corr_2c <- c(corr_2c, (cor$estimate)^2)
}
corr_2c
## cor cor cor cor cor
## 1.0000000000 0.0011806704 0.0013166376 0.0008190977 0.0006665384
## cor cor cor cor cor
## 0.0006665384 0.0006665384 0.0017531422 0.0017531422 0.0028664568
## cor cor cor cor cor
## 0.0058871454 0.0097047428 0.0074004589 0.0061571783 0.0403803305
## cor cor cor cor cor
## 0.0441912674 0.0423251303 0.0449502338 0.0410445826 0.0449502338
## cor cor cor cor cor
## 0.0366164687 0.0449502338 0.0366164687 0.0457212415 0.0228560051
## cor
## 0.0228664106
corr_mx3 <- LD_plot(gene3_sig_HW_idx,3)
gene3_sig_SNP <- rownames(pval_df2_gene3_sig)
print('82 SNPs are identifed to be potential causal SNP for FAHD1')
## [1] "82 SNPs are identifed to be potential causal SNP for FAHD1"
gene3_sig_SNP
## [1] "rs2235639" "rs75276217" "rs7201813" "rs1178435" "rs2575352"
## [6] "rs1065663" "rs3751893" "rs2745205" "rs2575345" "rs2745195"
## [11] "rs142797272" "rs145338194" "rs2473469" "rs1657107" "rs1629534"
## [16] "rs7186249" "rs8046750" "rs9935687" "rs9928566" "rs2575357"
## [21] "rs62038378" "rs2076455" "rs2268674" "rs3760040" "rs1657152"
## [26] "rs28364708" "rs1143034" "rs11644748" "rs8044343" "rs34392705"
## [31] "rs11641513" "rs140254902" "rs12325082" "rs9652776" "rs11643835"
## [36] "rs3813760" "rs112311350" "rs141551352" "rs4598914" "rs62038431"
## [41] "rs62038440" "rs57530073" "rs138033584" "rs2575359" "rs1742423"
## [46] "rs1657118" "rs1742432" "rs410465" "rs449530" "rs388627"
## [51] "rs433268" "rs366223" "rs2492879" "rs2492881" "rs391543"
## [56] "rs150843657" "rs368188" "rs1742446" "rs4451947" "rs2437747"
## [61] "rs2575335" "rs2369275" "rs2815297" "rs6600180" "rs3952970"
## [66] "rs1657100" "rs2974857" "rs2982447" "rs2974860" "rs2917517"
## [71] "rs2906903" "rs1632124" "rs2982433" "rs1657139" "rs8059871"
## [76] "rs740466" "rs1742401" "rs344364" "rs344366" "rs344369"
## [81] "rs2575358" "rs58530366"
#gene1_sig_HW_idx
expr_plot <- function(gene, snp){
data <- data.frame('gene1_expr' = pheno[,gene], 'geno' = as.factor(geno[,snp]))
levels(data$geno) <- c('AA', 'AB', 'BB')
rownames(data) <- rownames(pheno)
p <- ggplot(data, aes(x = geno, y = gene1_expr, fill = geno)) + geom_boxplot() + geom_point(position=position_jitterdodge()) + ggtitle(paste0('Boxplot for expression of gene', gene, '_',symbol[gene],' at snp ', colnames(geno)[snp]))
p <- p + xlab('Genotype')
p <- p + ylab('Relative expression')
#plot(p)
ggsave(paste0('./gene',gene,'_',symbol[gene], ' at snp ', colnames(geno)[snp], '.pdf'), width=5, height=4)
}
#expr_plot(1,1)
for (i in 1:length(gene1_sig_HW_idx)){
expr_plot(1, gene1_sig_HW_idx[i])
}
for (i in 1:length(gene2_sig_HW_idx)){
expr_plot(2, gene2_sig_HW_idx[i])
}
for (i in 1:length(gene3_sig_HW_idx)){
expr_plot(3, gene3_sig_HW_idx[i])
}